In [1]:
import time

# timing decorator: apply as @timeit above a function definition to print its wall-clock runtime
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()

        #print('%r (%r, %r) %2.2f sec' %(method.__name__, args, kw, te-ts))
        print('%r %2.5f sec' %(method.__name__, te-ts))
        return result
    return timed
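
# Hedged usage sketch (not from the original run): decorating a function with @timeit
# prints its wall-clock runtime; slow_sum below is a made-up example.
@timeit
def slow_sum(n):
    return sum(range(n))

_ = slow_sum(10**6)   # prints something like 'slow_sum' 0.03512 sec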

In [4]:
from collections import OrderedDict

# de-duplicate legend entries that share a label before drawing the legend
handles, labels = plt.gca().get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))
plt.legend(by_label.values(), by_label.keys())

Out[4]:
<matplotlib.legend.Legend at 0x3d2969e048>

In [5]:
import pandas as pd
import logging
import numpy as np
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from scipy.optimize import minimize

In [24]:
def backtrack_line_search(fx, gradient, X, y, theta, alpha):
    # backtracking line search along the steepest-descent direction p = -gradient
    tau = c = 0.5
    p = -gradient
    m = np.dot(p, gradient)          # directional derivative, negative for a descent direction
    t = c*m
    step = compute_square_loss(theta + alpha*p, X, y)
    # Armijo condition: f(theta + alpha*p) - f(theta) <= alpha * c * m;
    # shrink alpha by tau until this sufficient-decrease condition holds
    while step - fx > alpha*t:
        alpha *= tau
        step = compute_square_loss(theta + alpha*p, X, y)
    return alpha

def compute_square_loss(theta, X, y):
    # average square loss: ||X theta - y||^2 / (2n)
    return (np.linalg.norm(np.dot(X, theta) - y)**2)/(2*X.shape[0])

def r_sq_loss(theta, X, y, lambda_reg):
    # ridge objective: square loss + lambda * ||theta||^2
    return compute_square_loss(theta, X, y) + lambda_reg*np.dot(theta.T, theta)

def compute_regularized_square_loss_gradient(theta, X, y, lambda_reg):
    # gradient of the ridge objective: X^T (X theta - y) / n + 2 * lambda * theta
    residual = np.dot(X, theta) - y
    return np.matmul(X.T, residual)/X.shape[0] + 2*theta*lambda_reg

def regularized_grad_descent(
    X, y, lambda_reg, alpha=5, num_iter=1000, l_search=True):

    (num_instances, num_features) = X.shape
    theta = np.ones(num_features)                    # initialize all weights (incl. bias term) to 1
    theta_hist = np.zeros((num_iter, num_features))  # history of iterates
    loss_hist = np.zeros(num_iter)                   # history of training losses
    a_hist = [alpha]                                 # history of step sizes chosen by the line search

    for i in range(num_iter):

        loss = compute_loss(theta, X, y)
        loss_hist[i] = loss

        gradient = compute_regularized_square_loss_gradient(theta, X, y, lambda_reg)
        #if i != 0 and loss >= loss_hist[i-1]:   # optional early stopping on non-decreasing loss
        #    return loss_hist[0:i-1], theta_hist[0:i-1, :]

        if l_search:
            alpha = backtrack_line_search(loss, gradient, X, y, theta, alpha)
            a_hist.append(round(alpha, 4))

        theta -= alpha*gradient
        theta_hist[i, :] = theta

    #print(a_hist)
    return loss_hist, theta_hist

def compute_loss(theta, X, y):
    # identical to compute_square_loss; kept under the name used in the cells below
    return (np.linalg.norm(np.dot(X, theta) - y)**2)/(2*X.shape[0])

def ridge_obj(Lambda, X, y):
    # ridge objective factory used with scipy.optimize.minimize in the sparsity analysis below
    def obj(theta):
        return compute_loss(theta, X, y) + Lambda*(np.linalg.norm(theta))**2
    return obj
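
A minimal sketch (not part of the original notebook) showing how regularized_grad_descent
and the backtracking line search above can be exercised; the synthetic data, shapes, and
lambda value here are illustrative assumptions.

In [ ]:
np.random.seed(0)
X_demo = np.hstack([np.random.randn(50, 4), np.ones((50, 1))])   # last column acts as a bias
true_w = np.array([1.0, -2.0, 0.5, 0.0, 3.0])
y_demo = np.dot(X_demo, true_w) + 0.1*np.random.randn(50)

loss_hist_demo, theta_hist_demo = regularized_grad_descent(
    X_demo, y_demo, lambda_reg=1e-3, alpha=5, num_iter=200)
print(loss_hist_demo[-1])        # final training loss
print(theta_hist_demo[-1])       # recovered weights, close to true_w for small lambda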

In [7]:
def get(Q):
    # load the Q-th data split from text files: 0 = train, 1 = validation, 2 = test
    data = [['X_train.txt', 'y_train.txt'],
            ['X_valid.txt', 'y_valid.txt'], ['X_test.txt', 'y_test.txt']]
    X = np.loadtxt(data[Q][0])
    y = np.loadtxt(data[Q][1])
    #print(X.shape, y.shape)
    return X, y

def feature_normalization(train, valid, test):
    # min-max scale each feature using the *training* statistics, so the training
    # columns land in [0, 1] and valid/test are transformed consistently
    for col in range(train.shape[1]):
        minimum = np.nanmin(train[:, col])
        train[:, col] -= minimum
        valid[:, col] -= minimum
        test[:, col] -= minimum

        max_ = np.nanmax(train[:, col])   # equals (max - min) of the original training column
        train[:, col] /= max_
        valid[:, col] /= max_
        test[:, col] /= max_
    return train, valid, test

X_train, y_train = get(0)
X_valid, y_valid = get(1)
X_test, y_test = get(2)

#X_train, X_valid, X_test = feature_normalization(X_train, X_valid, X_test)
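
A quick hedged check (illustrative, not from the original run) of what feature_normalization
does: each training column is min-max scaled into [0, 1] using training statistics, and the
same shift and scale are applied to the other splits. The random arrays below are assumptions.

In [ ]:
a = np.random.rand(10, 3) * 5 + 2
b = np.random.rand(4, 3) * 5 + 2
c = np.random.rand(4, 3) * 5 + 2
a_n, b_n, c_n = feature_normalization(a, b, c)
print(a_n.min(axis=0), a_n.max(axis=0))   # ~0 and ~1 per training column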

In [13]:
#L = [10**x for x in range(-5, 6)]
L = np.logspace(-6, 1, num=16)
loss = [[],[],[]]
for Lambda in L: #range(-5,6):
    #Lambda = 10**i
    
    #clf = Ridge(alpha=Lambda)
    #clf.fit(X_train, y_train)
    #loss[0].append(compute_loss(clf.coef_, X_valid, y_valid))
    
    lossy, theta = regularized_grad_descent(
        X_train, y_train, lambda_reg=Lambda, alpha=5, num_iter=1000)
    loss[1].append(compute_loss(theta[-1], X_valid, y_valid))
    loss[2].append(lossy[-1])
    #w_opt = minimize(ridge(Lambda,  X_train, y_train, w), w)
    #loss[2].append(compute_loss(w_opt.x, X_valid, y_valid))

plt.plot(np.log10(L), loss[1], 'r', label='validation loss')
plt.plot(np.log10(L), loss[2], 'c', label='training loss')
#plt.plot(np.log10(L), loss[2], 'k', label='minimize', linestyle='--')
plt.xlabel('Regularizer in log10')
plt.ylabel('Square Loss')
plt.legend(loc='upper left')
plt.show()
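
One way (a hedged sketch, not in the original notebook) to read the best regularizer off
the validation curve just plotted:

In [ ]:
best_lambda = L[np.argmin(loss[1])]
print(best_lambda, min(loss[1]))   # lambda with the lowest validation square loss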



In [ ]:
#sparsity analysis for ridge regression
#lambda = 10e-5, tol = 0 -> minimize has 0% sparsity (no true zeros)
#lambda = 10e-5, tol = 10e-3 -> minimize has 6% sparsity 
tol = 10e-3
L = np.logspace(-7, 1, num=50)
percentages = [[], [], []]
D = X_train.shape[1]           # number of features
w = np.ones(D)                 # starting point for scipy.optimize.minimize
#L = np.logspace(-5, 5, num=10)
for Lambda in L:
    #Lambda = 10**i
    
    clf = Ridge(alpha=Lambda)
    clf.fit(X_train, y_train)
    percentages[0].append(sum(abs(clf.coef_) <= tol)/D)
    
    loss, theta = regularized_grad_descent(
        X_train, y_train, lambda_reg=Lambda, alpha=5, num_iter=500)
    percentages[1].append(sum(abs(theta[-1]) <= tol)/D)
    
    w_opt = minimize(ridge_obj(Lambda, X_train, y_train), w)
    percentages[2].append(sum(abs(w_opt.x) <= tol)/D)

plt.plot(np.log10(L), percentages[0], 'r', label='sklearn ridge')
#plt.plot(np.log10(L), percentages[1], 'c', label='handwritten')
plt.plot(np.log10(L), percentages[2], 'k', label='minimize', linestyle='--')
plt.xlabel('Regularizer in log10')
plt.ylabel('Sparsity w/ 10e-3 tolerance')
plt.legend(loc='upper right')
plt.show()

In [25]:
def ridge(X, y, L):
    # closed-form ridge solution: w = (X^T X + L*I)^{-1} X^T y
    X2 = np.matmul(X.T, X)
    X2[np.diag_indices_from(X2)] += L
    w = np.linalg.solve(X2, np.matmul(X.T, y))
    return w
#@timeit
def vec_lasso(X, y, w, L, num_iter=100):
    # vectorized coordinate descent for the lasso; w is the starting point
    # (e.g. the ridge solution) and is updated in place
    (num_samp, num_fea) = X.shape
    for run in range(num_iter):
        for j in range(num_fea):
            a_j = 2*np.dot(X[:, j], X[:, j])

            # residual with coordinate j's contribution added back in: y - Xw + w_j * x_j
            vec = y - np.matmul(w.T, X.T) + (w*X)[:, j]
            c_j = 2*np.dot(X[:, j], vec)

            # soft-thresholding update for coordinate j
            a = c_j/a_j
            delta = L/a_j
            w[j] = np.sign(a) * max(abs(a) - delta, 0)
    return w
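
A hedged sanity check (not part of the original run): the closed-form ridge above should
match sklearn's Ridge with fit_intercept=False, since both solve min ||Xw - y||^2 + L*||w||^2,
and the resulting w is a reasonable warm start for vec_lasso. The values below are illustrative.

In [ ]:
w_cf = ridge(X_train, y_train, L=1.0)
w_sk = Ridge(alpha=1.0, fit_intercept=False).fit(X_train, y_train).coef_
print(np.max(np.abs(w_cf - w_sk)))            # should be ~0

w_lasso = vec_lasso(X_train, y_train, w_cf.copy(), L=1.0, num_iter=100)
print(np.sum(w_lasso == 0), 'exact zeros')    # the lasso drives some coordinates exactly to 0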

In [16]:
#searching for best lambda using validation square error 
L = np.logspace(-4, 1, num=20)
lasso_loss = []
train = []

for Lambda in L:
    w = ridge(X_train, y_train, L=Lambda)
    theta = vec_lasso(X_train, y_train, w, L=Lambda, num_iter=1000)
    lasso_loss.append(compute_loss(theta, X_valid, y_valid)) 
    train.append(compute_loss(theta, X_train, y_train))
plt.plot(np.log10(L), lasso_loss, 'r', label='validation loss')
plt.plot(np.log10(L), train, 'b', label='training loss')

plt.xlabel('Regularizer in log10')
plt.ylabel('Square Loss')
plt.legend(loc='upper left')
plt.show()



In [43]:
# test loss after each coordinate-descent sweep, warm-started from the ridge solution
loss = []
w = ridge(X_train, y_train, 1)
for i in range(1000):
    w = vec_lasso(X_train, y_train, w, L=1, num_iter=1)
    loss.append(compute_loss(w, X_test, y_test))

plt.plot(range(1000), loss, 'b', label='regularized test loss')
plt.legend()
plt.show()



In [44]:
lambdas = np.logspace(-1, 4, 100)
coefs = []
for lamb in lambdas:
    w = ridge(X_train, y_train, L=lamb)
    theta = vec_lasso(X_train, y_train, w, lamb, num_iter = 1000)
    coefs.append(theta) 

ax = plt.gca()

ax.plot(lambdas, coefs)
#ax.axvline(30)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('Lambda')
plt.ylabel('weights')
plt.title('Lasso coefficients as a function of regularization')
plt.axis('tight')
plt.show()



In [ ]:
tol = 0
w = ridge(X_train, y_train, L=1)
theta = vec_lasso(X_train, y_train, w, 1, num_iter = 2000)
compute_loss(theta, X_test, y_test)
#print(sum(abs(theta) <= tol))

In [ ]:
#@timeit
def lasso(X, y, w, L, num_iter=100):
    # naive (unvectorized) coordinate descent for the lasso, kept for the timing
    # comparison against vec_lasso; same soft-thresholding update, computed sample by sample
    (num_samp, num_fea) = X.shape
    for run in range(num_iter):
        for j in range(num_fea):
            a_j = 0
            c_j = 0
            for i in range(num_samp):
                a_j += 2*(X[i, j]**2)
                c_j += 2*X[i, j]*(y[i] - np.dot(w.T, X[i, :]) + w[j]*X[i, j])

            a = c_j/a_j
            delta = L/a_j
            w[j] = np.sign(a) * max(abs(a) - delta, 0)
    return w

In [ ]:
#@timeit
def lasso_homotopy(X, y, L):
    # homotopy / warm-start strategy: start from the zero vector and run one
    # coordinate-descent sweep at a time while gradually decreasing the regularizer
    (num_samp, num_fea) = X.shape
    w = np.zeros(num_fea)

    while L > 0:
        L = max(L - 0.001, 0)
        w = vec_lasso(X, y, w, L, num_iter=1)

    #print(compute_loss(w, X, y))
    return w

#for lamb in [0, 0.1, 1, 10, 100]:
 #   lasso_homotopy(X_train, y_train, lamb)
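
A hedged usage sketch (not from the original run) of the homotopy idea above: warm-start
from the current solution while lowering the regularizer, instead of re-solving from scratch.
The lambda value is an illustrative assumption.

In [ ]:
w_h = lasso_homotopy(X_train, y_train, L=1.0)
print(compute_loss(w_h, X_valid, y_valid))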

In [ ]:
@timeit
def f(inp):
    # time one lasso variant over the regularization path L
    # (swap in vec_lasso or lasso_homotopy to reproduce the other timings below)
    L = np.logspace(-4, 1, num=20)
    for lamb in L:
        lasso(X_train, y_train, w, L=lamb, num_iter=1000)
        #vec_lasso(X_train, y_train, w, L=lamb, num_iter=1000)
    return inp

l = f(7)

homotopy method over regularization path L took 139.46233 s

vectorized lasso over L took 126.85214 s

basic lasso over L took 1603 s

